# necessary imports
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import ast
import warnings
warnings.filterwarnings('ignore')
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from gensim.test.utils import datapath
import sys
sys.path.append('../scripts/')
from lda import create_documents, run_ldas, get_topicwords_scores, get_tweet_topic
# Load the preprocessed tweets for LDA from the BTW17 Twitter data directory.
df = pd.read_json('../../data/BTW17_Twitter/lda/preprocessed_lda_tweets.json')
# Pull the author's screen name out of each 'user' entry; .get() yields None
# when the key is absent (assumes every 'user' value is dict-like - TODO confirm).
df['username'] = df['user'].apply(lambda x: x.get('screen_name'))
# Notebook-style preview of the first three rows (no effect when run as a script).
df.head(3)
# Build the LDA input documents. Per the original note, tweets from the same
# author with the same hashtag are joined into one document; the actual logic
# lives in the project's lda module.
documents = create_documents(df)
# Fit LDA models for a sweep of topic counts. 50 and 100 are presumably the
# bounds of num_topics - confirm in run_ldas whether the upper bound is inclusive.
model_list, coherence_scores = run_ldas(documents, 50, 100)
# Re-key the results under the German labels that the plots below read.
coherence_scores['Coherence Score'] = coherence_scores.pop('c_v')
coherence_scores['Anzahl Topics'] = coherence_scores.pop('num_topics')
# Side experiment with the (200, 201) setting: only the scores are printed;
# the returned models in x are not used anywhere in the visible code.
x, y = run_ldas(documents, 200, 201)
print(y)
# Der Coherence Score allein reicht als Auswahlkriterium nicht aus (siehe das Experiment mit 200 und 201 Topics). Deshalb wird er um die Anzahl eindeutiger Topics je Modell ergänzt.
# Number of distinct topic descriptions per fitted model. print_topics(-1)
# yields (index, topic-string) pairs; collapsing the strings into a set
# counts topics whose printed word lists are identical only once.
n_topics = [
    len({topic for _, topic in model.print_topics(-1)})
    for model in model_list
]

# Two panels side by side: coherence score (left) and number of distinct
# topics (right), both plotted against the number of topics requested.
fig = make_subplots(rows=1, cols=2, horizontal_spacing=0.2)
fig.add_trace(
    go.Scatter(
        x=coherence_scores['Anzahl Topics'],
        y=coherence_scores['Coherence Score'],
        mode='lines',
        name='Coherence Score',
        line=dict(color=px.colors.qualitative.Antique[0]),
    ),
    row=1, col=1,
)
fig.add_trace(
    go.Scatter(
        # Take the x values from the recorded topic counts instead of a
        # hard-coded range(50, 101), so the axis cannot drift away from the
        # arguments passed to run_ldas. Assumes run_ldas records one
        # 'num_topics' entry per returned model - confirm in the lda module.
        x=coherence_scores['Anzahl Topics'],
        y=n_topics,
        mode='lines',
        name='Anzahl eindeutiger Topics',
        line=dict(color=px.colors.qualitative.Antique[1]),
    ),
    row=1, col=2,
)
fig.update_yaxes(title='Coherence Score', row=1, col=1)
fig.update_yaxes(title='Anzahl eindeutiger Topics', row=1, col=2)
fig.update_xaxes(title='Anzahl Topics', row=1, col=1)
fig.update_xaxes(title='Anzahl Topics', row=1, col=2)
fig.update_layout(
    font=dict(family='Computer Modern', color='black', size=15),
    template='simple_white',
)
fig.show()
# Model used for every downstream step. The position 47 is hard-coded; which
# topic count it corresponds to depends on how run_ldas orders model_list.
lda_model = model_list[47]

# Persist the chosen model at the location gensim's datapath() resolves for 'model'.
temp_file = datapath('model')
lda_model.save(temp_file)

# Print each topic index together with its printed word list.
topic_listing = lda_model.print_topics(-1)
for idx, topic in topic_listing:
    print('Topic: {} \nWords: {}'.format(idx, topic))

# Export the per-topic words and scores produced by the project helper.
topic_df = get_topicwords_scores(lda_model)
topic_df.to_json('../../data/BTW17_Twitter/lda/topics.json')

# Annotate the tweets with topic information from the chosen model and store them.
output_df = get_tweet_topic(lda_model, df)
output_df.to_json('../../data/BTW17_Twitter/lda/preprocessed_lda_tweets_topics.json')
# Notebook-style preview of the first three rows.
output_df.head(3)
# Horizontal bar chart: how many distinct topics occur under each hashtag,
# ordered by that count.
plot_df = (
    output_df
    .groupby('tags', as_index=False)['topic']
    .nunique()
    .sort_values(by='topic')
    .reset_index()
    .rename(columns={'tags':'Hashtag', 'topic':'Anzahl Topics'})
)
fig = px.bar(
    plot_df,
    x='Anzahl Topics',
    y='Hashtag',
    orientation='h',
    template='simple_white',
    color_discrete_sequence=px.colors.qualitative.Antique,
)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()
import pandas as pd
from tqdm.notebook import tqdm
import plotly.express as px
# Reload the stored artefacts so this part can run without refitting the models.
output_df = pd.read_json('../../data/BTW17_Twitter/lda/preprocessed_lda_tweets_topics.json')
topic_df = pd.read_json('../../data/BTW17_Twitter/lda/topics.json')

# Only the hashtag, the assigned topic and its score are needed here.
tmp_df = output_df[['tags', 'topic', 'topic_score']]
hashtags = tmp_df['tags'].unique().tolist()

# For every hashtag, pick the topic with the largest summed topic_score.
# The score column is selected explicitly before summing: the previous
# `.sum('topic_score')` passed the column name into the positional
# `numeric_only` slot and only worked because a non-empty string is truthy.
# The aggregation now runs once per hashtag instead of twice, and idxmax()
# on the topic-indexed Series returns the topic label directly (ties still
# resolve to the first, i.e. smallest, topic because groupby sorts its keys).
topics = []
for hashtag in tqdm(hashtags):
    tmp = tmp_df[tmp_df['tags'] == hashtag]
    score_by_topic = tmp.groupby('topic')['topic_score'].sum()
    topics.append(score_by_topic.idxmax())

# One row per hashtag, enriched with the columns of topic_df for the chosen
# topic (inner join on 'topic').
hashtag_topics = pd.DataFrame(data={'hashtag': hashtags, 'topic': topics})
hashtag_topics = hashtag_topics.merge(topic_df, on='topic')
# save to json
hashtag_topics.to_json('../../data/BTW17_Twitter/lda/hashtag_topics.json')
# Notebook-style preview of the first three rows.
hashtag_topics.head(3)
# Horizontal bar chart: how many distinct hashtags were mapped to each topic,
# with rows ordered by topic.
plot_df = (
    hashtag_topics
    .groupby('topic', as_index=False)['hashtag']
    .nunique()
    .sort_values(by='topic')
    .reset_index()
    .rename(columns={'topic':'Topic', 'hashtag':'Anzahl Hashtags'})
)
fig = px.bar(
    plot_df,
    x='Anzahl Hashtags',
    y='Topic',
    orientation='h',
    template='simple_white',
    color_discrete_sequence=px.colors.qualitative.Antique,
)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()